########
#### Input Parameters
########

# Baseline Tests | REMOVE for HPC
# input_stratified<-"Censored20"
# input_citation_deflation="Deflated"
# input_citation_window <- 5
# input_logp1="Log"

args <- commandArgs(TRUE)
#print(args)

input_discipline <- as.character(args[1]) #Discipline
input_stratified <- as.character(args[2]) #NotCensored or CensoredN || "Censored20" is the Baseline
input_citation_deflation <- as.character(args[3]) #NotDeflated or Deflated || "Deflated" is the Baseline
input_citation_window <- as.integer(args[4]) #10, 5, 2 || 5 is the Baseline
input_logp1 <- as.character(args[5]) #NoLog or Log || "Log" is the Baseline


# E.g., "Computer_Science" --> "Computer Science"
input_discipline <- gsub("_"," ",input_discipline)

print(input_discipline)
print(input_stratified)
print(input_citation_deflation)
print(input_citation_window)
print(input_logp1)

########
#### Measuring Nestedness: NODF | in R
#######
# Sources: 
# https://rdrr.io/cran/bipartite/man/nested.html#heading-9
# https://jeffollerton.wordpress.com/2019/08/12/weighted-nestedness-and-classical-nestedness-analyses-do-not-measure-the-same-thing-in-species-interaction-networks/
# https://fukamilab.github.io/BIO202/09-B-networks.html

########
#### Packages
#######
#install.packages("bipartite")
library("bipartite")
library("dplyr")
library("tidyr")
library("parallel")
library("foreach")
library("doParallel")
library("reshape2")
#install.packages("maxnodf")
library("maxnodf")
library("e1071")
library("ineq")

#library("remotes")
#remotes::install_github("PABalland/EconGeo")
#or
#install.packages("devtools",dependencies = TRUE)
#library(devtools)
#or
#install_github("PABalland/EconGeo")
#install.packages("remotes")
#library("remotes)
#remotes::install_github("PABalland/EconGeo")
library("EconGeo")

# Modularity 
# install.packages("lpbrim")
library("lpbrim")

# Ubiquity and diversity
library("diverse") 

########
#### Read in Files
#######
# List of Fields 
# Medicine
# Business
# Computer Science
# Biology
# Environmental Science
# Physics
# Economics
# Geology
# Mathematics
# Psychology
# Chemistry
# Materials Science

Raw_CountryCitation_Field_df <- read.csv("/Path/Input_Data/OUTPUT_Python_MAG_Ecology_Citations_GRID_Edgelist.csv.gz") %>%
  subset(FIELD==input_discipline)

# Diversity DataFrame
GRID_to_COUNTRY_df <- Raw_CountryCitation_Field_df%>%
  select(GRID,COUNTRY)%>%
  rename("GRIDs"="GRID","Country"="COUNTRY")%>%
  unique()


# Supplemental Materials | Stratified Sampling by Country
if(input_stratified!="NotCensored"){

  #E.g., if "Censored20" --> Top 20th Percentile, so nth_tile_<-5
  nth_tile_ <- 100/as.numeric(gsub("\\D", "", input_stratified))
  
  GRIDs_by_Country_df <- Raw_CountryCitation_Field_df%>%
    subset(YEAR_THAT_IS_CITING-YEAR<=input_citation_window)%>%
    group_by(FIELD,COUNTRY,GRID)%>%
    summarize(Presence_Count=sum(CITATIONS_RECEIVED))%>%
    group_by(FIELD,COUNTRY)%>%
    mutate(Nth_Tile = ntile(Presence_Count, nth_tile_))%>%
    group_by(FIELD,COUNTRY)%>%
    mutate(Number_of_Universities = n())%>%
    subset((Nth_Tile==nth_tile_&Number_of_Universities>1)|(Nth_Tile==1&Number_of_Universities==1))%>%
    select(FIELD,COUNTRY,GRID)%>%
    as.data.frame()
  
  
  Raw_CountryCitation_Field_df <- merge(GRIDs_by_Country_df,
                                    Raw_CountryCitation_Field_df,
                                    by=c("FIELD",'COUNTRY','GRID'),
                                    all.x=T) 
  
}

# Create Citation Deflation Index
if(input_citation_deflation=="Deflated"){
  
  Raw_CountryCitation_Field_df <- Raw_CountryCitation_Field_df %>%
  dplyr::mutate(CITATIONS_RECEIVED = CITATIONS_RECEIVED*(YEAR_NUMBER_OF_DOCUMENTS/YEAR_THAT_IS_CITING_NUMBER_OF_DOCUMENTS)) %>%
  as.data.frame()

}

# SM A | Citation Windows 
# Change 5 to other numbers: 2, 4, 6, 8, 10
CountryCitation_Field_df <- Raw_CountryCitation_Field_df%>%
  replace(is.na(.), 0)%>%
  subset(YEAR_THAT_IS_CITING-YEAR<=input_citation_window)%>%
  dplyr::group_by(FIELD,SUBFIELD,GRID,YEAR)%>%
  dplyr::summarise(CITATIONS_RECEIVED = sum(CITATIONS_RECEIVED))%>%
  as.data.frame()

# Need Complete List of Countries Per Field and SubField
CountryCitation_Field_df <- merge(CountryCitation_Field_df%>%
                                    dplyr::select(FIELD,SUBFIELD,GRID)%>%
                                    unique(),
                                  CountryCitation_Field_df%>%
                                    dplyr::select(YEAR)%>%
                                    unique(),
                                  all=T) %>%
  merge(.,CountryCitation_Field_df,
        by=c("FIELD","SUBFIELD","GRID","YEAR"),
        all.x=T)%>%
  replace(is.na(.), 0)

Matrix_df <- data.frame()
Ideal_Matrix_df <- data.frame()
NODF_df <- data.frame()
Diversity_df <- data.frame()
Ubiquity_df <- data.frame()

# SM A | Citation Window - Since the data go to 2017, need to subtract the citation window
# from the last number (e.g., 2017 - 5 = 2012)
for(year_ in c(1990:(2017-input_citation_window))){
  
Matrix_t_df <- tidyr::spread(CountryCitation_Field_df%>%
                                         subset(YEAR==year_)%>%
                                         subset(select=c("SUBFIELD","GRID","CITATIONS_RECEIVED"))%>%
                               dplyr::mutate(SUBFIELD=as.character(SUBFIELD),
                                             GRID=as.character(GRID)), 
                             "SUBFIELD", "CITATIONS_RECEIVED")%>% 
  replace(is.na(.), 0)

rownames(Matrix_t_df) <- Matrix_t_df$GRID
Matrix_t_df$GRID <- NULL

# SM C | Do we LOG our results?
# Comment out and run with our main discretization to see changes. 
if(input_logp1=="Log"){
Matrix_t_df <- Matrix_t_df %>% log1p()
}

# RR Update | (OLD SM D) How we discretize? | RCA
Matrix_t_df <- RCA(Matrix_t_df)
Matrix_t_df[Matrix_t_df<1]<-0
Matrix_t_df[Matrix_t_df>=1]<-1
Matrix_t_df <- Matrix_t_df %>%
  replace(is.na(.), 0)

Matrix_t_df <- Matrix_t_df %>% subset(rowSums(.)!=0)
Matrix_t_df <- Matrix_t_df %>% t() %>% subset(rowSums(.)!=0) %>% t()

Ideal_NODF_Matrix_df <- maxnodf(Matrix_t_df, quality=0)$max_nodf_mtx %>% melt() %>%
  dplyr::rename(RowRank=Var1,
                ColRank=Var2,
                Value=value)

RowNames_for_Ideal_NODF_Matrix_df <- data.frame(RowSums = rowSums(Matrix_t_df)[order(rowSums(Matrix_t_df),decreasing=T)],
                                                RowRank=seq(1:length(order(rowSums(Matrix_t_df)))))%>%
  dplyr::mutate(Nations=row.names(.))%>%
  dplyr::select(Nations,RowRank)

ColNames_for_Ideal_NODF_Matrix_df <- data.frame(ColSums = colSums(Matrix_t_df)[order(colSums(Matrix_t_df),decreasing=T)],
                                                ColRank=seq(1:length(order(colSums(Matrix_t_df)))))%>%
  dplyr::mutate(Topics=row.names(.))%>%
  dplyr::select(Topics,ColRank)

Ideal_NODF_Matrix_df <- Ideal_NODF_Matrix_df %>%
  merge(.,RowNames_for_Ideal_NODF_Matrix_df,by.x=c("RowRank"))%>%
  merge(.,ColNames_for_Ideal_NODF_Matrix_df,by.x=c("ColRank"))%>%
  dplyr::select(Nations,Topics,Value)%>%
  dplyr::mutate(Discipline=input_discipline,
                Year=year_)

Ideal_Matrix_df <- rbind(Ideal_Matrix_df,Ideal_NODF_Matrix_df)

nodf_ <- data.frame("NODF"=nodf_cpp(Matrix_t_df), #Note: Same as "NODF2" in nested()
                    "NODFc"=NODFc(Matrix_t_df),
                    "NODF_Normed"=nodf_cpp(Matrix_t_df)/maxnodf(Matrix_t_df, quality = 0)$max_nodf,
                    "Temperature"=nested(Matrix_t_df, "binmatnest", rescale=FALSE, normalised=TRUE),
                    "CScore"=nested(Matrix_t_df, "C score", rescale=FALSE, normalised=TRUE),
                    "Modules"=compart(Matrix_t_df)$n.compart, # Note, need Ordered 
                    "Gini_Labels"=ineq::Gini(rowSums(Matrix_t_df)),
                    "Year"=year_,
                    "Discipline"=input_discipline,
                    "Sample_Stratified"=input_stratified,
                    "Citation_Deflated"=input_citation_deflation,
                    "Citation_Window"=as.character(input_citation_window),
                    "Citation_Logged"=input_logp1,
                    "Number_of_Links"=sum(Matrix_t_df),
                    "Acutal_K"=dim(Matrix_t_df)[2],
                    "Labels_Present"=dim(Matrix_t_df)[1], #Note: Don't use _Discrete df
                    "Actual_Labels_Present"=dim(Matrix_t_df)[1])

row.names(nodf_)<-0

NODF_df = rbind(NODF_df,nodf_)

Matrix_df = rbind(Matrix_df,melt(Matrix_t_df)%>%
                    dplyr::mutate(Discipline=input_discipline,
                           Year=year_)%>%
                    dplyr::rename(Nations=Var1,
                                  Topics=Var2,
                                  Value=value))

diversity_ <- data.frame(diverse::diversity(Matrix_t_df,type = "herfindahl-hirschman"),diverse::diversity(Matrix_t_df,type = "entropy"),diverse::diversity(Matrix_t_df,type = "blau"),diverse::diversity(Matrix_t_df,type = "simpson")) %>%
  dplyr::rename("Blau"='blau.index',"Shannon_Entropy"="entropy")
diversity_$"Year"=year_
diversity_$"Discipline"=input_discipline
diversity_$"GRIDs"=row.names(diversity_)
row.names(diversity_)<-NULL
Diversity_df <- rbind(Diversity_df,diversity_)

ubiquity_ <- data.frame(diverse::ubiquity(Matrix_t_df),diverse::diversity(t(Matrix_t_df),type = "entropy"))
ubiquity_$"Year"=year_
ubiquity_$"Discipline"= input_discipline
ubiquity_$"Subfields"=row.names(ubiquity_)
row.names(ubiquity_)<-NULL
Ubiquity_df <- rbind(Ubiquity_df,ubiquity_)

}


Diversity_df <- merge(Diversity_df,GRID_to_COUNTRY_df,by=c("GRIDs"),all.x=T)

Filename_Diversity = paste("/Path/Data/OUTPUT_R_NODF_Diversity_",input_stratified,"_",input_citation_deflation,"_Window",as.character(input_citation_window),"_",input_logp1,"_",gsub(" ","",input_discipline),".csv.gz",sep="")
Diversity_df$"Sample_Stratified"=input_stratified
Diversity_df$"Citation_Deflated"=input_citation_deflation
Diversity_df$"Citation_Window"=as.character(input_citation_window)
Diversity_df$"Citation_Logged"=input_logp1
write.csv(Diversity_df,file=gzfile(Filename_Diversity),row.names = F)

# Ubiquity DataFrame
Filename_Ubiquity = paste("/Path/Data/OUTPUT_R_NODF_Ubiquity_",input_stratified,"_",input_citation_deflation,"_Window",as.character(input_citation_window),"_",input_logp1,"_",gsub(" ","",input_discipline),".csv.gz",sep="")
Ubiquity_df$"Sample_Stratified"=input_stratified
Ubiquity_df$"Citation_Deflated"=input_citation_deflation
Ubiquity_df$"Citation_Window"=as.character(input_citation_window)
Ubiquity_df$"Citation_Logged"=input_logp1
write.csv(Ubiquity_df,file=gzfile(Filename_Ubiquity),row.names = F)

Filename_NODF = paste("/Path/Data/OUTPUT_R_NODF_Citations_",input_stratified,"_",input_citation_deflation,"_Window",as.character(input_citation_window),"_",input_logp1,"_",gsub(" ","",input_discipline),".csv.gz",sep="")
write.csv(NODF_df,file=gzfile(Filename_NODF),row.names = F)

Filename_NODF_Matrix = paste("/Path/Data/OUTPUT_R_NODF_Matrix_Citations_",input_stratified,"_",input_citation_deflation,"_Window",as.character(input_citation_window),"_",input_logp1,"_",gsub(" ","",input_discipline),".csv.gz",sep="")
Matrix_df$"Sample_Stratified"=input_stratified
Matrix_df$"Citation_Deflated"=input_citation_deflation
Matrix_df$"Citation_Window"=as.character(input_citation_window)
Matrix_df$"Citation_Logged"=input_logp1
write.csv(Matrix_df,file=gzfile(Filename_NODF_Matrix),row.names = F)

Filename_Ideal_NODF_Matrix = paste("/Path/Data/OUTPUT_R_Ideal_NODF_Matrix_Citations_",input_stratified,"_",input_citation_deflation,"_Window",as.character(input_citation_window),"_",input_logp1,"_",gsub(" ","",input_discipline),".csv.gz",sep="")
Ideal_Matrix_df$"Sample_Stratified"=input_stratified
Ideal_Matrix_df$"Citation_Deflated"=input_citation_deflation
Ideal_Matrix_df$"Citation_Window"=as.character(input_citation_window)
Ideal_Matrix_df$"Citation_Logged"=input_logp1
write.csv(Ideal_Matrix_df,file=gzfile(Filename_Ideal_NODF_Matrix),row.names = F)


# Compare Ideal t with Raw t
# Find 'Predictions' or where misses (Deltas) happen
# Then compare these 'Predictions' with the future t+n raw matrices
# And count how many became actual predictions and how many become errors


Predicted_Values_df_t <- Ideal_Matrix_df %>% 
  dplyr::rename(Predicted_Value=Value)%>%
  merge(.,Matrix_df %>%
        dplyr::rename(Actual_Value=Value),
      by=c("Nations","Topics","Discipline","Year"),all.x=T) %>%
  as.data.frame() %>%
  dplyr::mutate(Delta = Predicted_Value-Actual_Value)%>%
  dplyr::mutate(Delta_Type = case_when(Delta==1 ~ "Predicted_Presence",
                                       Delta==-1 ~ "Predicted_Absence",
                                       Delta==0 & Predicted_Value==0 ~ "Consistent_Absence",
                                       Delta==0 & Predicted_Value==1 ~ "Consistent_Presence"))
  

Predicted_Values_df_Future_t <- Predicted_Values_df_t %>%  
        dplyr::select(Nations,Topics,Discipline,Year) %>% 
        unique() %>%
        dplyr::slice(rep(1:n(), each = input_citation_window)) %>%
        dplyr::group_by(Discipline,Year,Nations,Topics)%>%
        dplyr::mutate(Year_T = 1:n())%>%
        dplyr::mutate(Year_T = Year + Year_T)%>%
        merge(.,Predicted_Values_df_t,
              by=c("Nations","Topics","Discipline","Year"),all.x=T) %>%
        merge(.,Matrix_df %>%
                dplyr::rename(Year_T=Year),
              by=c("Nations","Topics","Discipline","Year_T"),all.x=T) %>%
        dplyr::rename(Actual_Future_Value = Value) %>%
        as.data.frame() %>%
          dplyr::mutate(Future_Delta = Predicted_Value-Actual_Future_Value)%>%
  dplyr::mutate(Future_Delta_Type = case_when(Delta_Type=="Predicted_Presence" & Actual_Future_Value==1 ~ "Correct_Presence",
                                              Delta_Type=="Predicted_Presence" & Actual_Future_Value==0 ~ "Incorrect_Presence",
                                              Delta_Type=="Predicted_Absence" & Actual_Future_Value==0 ~ "Correct_Absence",
                                              Delta_Type=="Predicted_Absence" & Actual_Future_Value==1 ~ "Incorrect_Absence",
                                              Delta_Type=="Consistent_Absence" & Future_Delta==0 ~ "Consistent_Absence",
                                              Delta_Type=="Consistent_Absence" & Future_Delta!=0 ~ "Inconsistent_Absence",                                           
                                              Delta_Type=="Consistent_Presence" & Future_Delta==0 ~ "Consistent_Presence",
                                              Delta_Type=="Consistent_Presence" & Future_Delta!=0 ~ "Inconsistent_Presence"))


Percent_Predicted_Values_df_Future_t <- Predicted_Values_df_Future_t %>%
  drop_na()%>%
  dplyr::group_by(Year,Year_T,Discipline,Delta_Type,Future_Delta_Type)%>%
  dplyr::summarise(N=n())%>%
  dplyr::group_by(Year,Year_T,Discipline,Delta_Type)%>%
  dplyr::mutate(Percent=N/sum(N))%>%
  dplyr::mutate(Delta_T = Year_T-Year )%>%
  as.data.frame()


# Note: Change name for SM
Filename_Predicted = paste("/Path/Data/OUTPUT_R_Predicted_Values_",input_stratified,"_",input_citation_deflation,"_Window",as.character(input_citation_window),"_",input_logp1,"_",gsub(" ","",input_discipline),".csv.gz",sep="")
Percent_Predicted_Values_df_Future_t$"Sample_Stratified"=input_stratified
Percent_Predicted_Values_df_Future_t$"Citation_Deflated"=input_citation_deflation
Percent_Predicted_Values_df_Future_t$"Citation_Window"=as.character(input_citation_window)
Percent_Predicted_Values_df_Future_t$"Citation_Logged"=input_logp1
write.csv(Percent_Predicted_Values_df_Future_t,file=gzfile(Filename_Predicted),row.names = F)


# Plot | Sanity Check
# ggplot(data=Percent_Predicted_Values_df_Future_t%>%
#           subset(Delta_Type=="Predicted_Presence")%>%
#          merge(.,NODF_df,by=c("Discipline","Year"),all.x=T),
#         aes(x=NODF_Normed,y=Percent,group=Discipline,colour=Future_Delta_Type))+
#   geom_point(aes(group=Discipline))+
#   geom_smooth(aes(group=Future_Delta_Type),method="lm")+
#   facet_grid(~Delta_T)

